{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "# ! pip install modelscope[nlp]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "ename": "ImportError",
     "evalue": "cannot import name 'AutoTokenizer' from 'modelscope' (/home/ubuntu/code/chat-glm3-deployment-fine-tuning/.conda/lib/python3.11/site-packages/modelscope/__init__.py)",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mImportError\u001b[0m                               Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[7], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m \u001b[38;5;28;01mimport\u001b[39;00m \u001b[38;5;21;01mtorch\u001b[39;00m\n\u001b[0;32m----> 2\u001b[0m \u001b[38;5;28;01mfrom\u001b[39;00m \u001b[38;5;21;01mmodelscope\u001b[39;00m \u001b[38;5;28;01mimport\u001b[39;00m AutoTokenizer, AutoModel, snapshot_download\n\u001b[1;32m      3\u001b[0m model_dir \u001b[38;5;241m=\u001b[39m snapshot_download(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mZhipuAI/chatglm3-6b\u001b[39m\u001b[38;5;124m\"\u001b[39m, revision \u001b[38;5;241m=\u001b[39m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mv1.0.0\u001b[39m\u001b[38;5;124m\"\u001b[39m)\n\u001b[1;32m      4\u001b[0m tokenizer \u001b[38;5;241m=\u001b[39m AutoTokenizer\u001b[38;5;241m.\u001b[39mfrom_pretrained(model_dir, trust_remote_code\u001b[38;5;241m=\u001b[39m\u001b[38;5;28;01mTrue\u001b[39;00m)\n",
      "\u001b[0;31mImportError\u001b[0m: cannot import name 'AutoTokenizer' from 'modelscope' (/home/ubuntu/code/chat-glm3-deployment-fine-tuning/.conda/lib/python3.11/site-packages/modelscope/__init__.py)"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "from modelscope import AutoTokenizer, AutoModel, snapshot_download\n",
    "model_dir = snapshot_download(\"ZhipuAI/chatglm3-6b\", revision = \"v1.0.0\")\n",
    "tokenizer = AutoTokenizer.from_pretrained(model_dir, trust_remote_code=True)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "2.4.1+cu118\n",
      "True\n",
      "11.8\n"
     ]
    }
   ],
   "source": [
    "import torch\n",
    "\n",
    "print(torch.__version__)\n",
    "print(torch.cuda.is_available())\n",
    "print(torch.version.cuda)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading checkpoint shards:   0%|          | 0/7 [00:00<?, ?it/s]"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Loading checkpoint shards: 100%|██████████| 7/7 [00:00<00:00,  9.61it/s]\n"
     ]
    }
   ],
   "source": [
    "with torch.no_grad():\n",
    "    #采用量化方案的ChatGLM3\n",
    "    model = AutoModel.from_pretrained(model_dir, trust_remote_code=True).quantize(4).cuda()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "ename": "TypeError",
     "evalue": "GenerationMixin._extract_past_from_model_output() got an unexpected keyword argument 'standardize_cache_format'",
     "output_type": "error",
     "traceback": [
      "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
      "\u001b[0;31mTypeError\u001b[0m                                 Traceback (most recent call last)",
      "Cell \u001b[0;32mIn[7], line 2\u001b[0m\n\u001b[1;32m      1\u001b[0m model \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39meval()\n\u001b[0;32m----> 2\u001b[0m response, history \u001b[38;5;241m=\u001b[39m \u001b[43mmodel\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mchat\u001b[49m\u001b[43m(\u001b[49m\u001b[43mtokenizer\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[38;5;124;43m你好\u001b[39;49m\u001b[38;5;124;43m\"\u001b[39;49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mhistory\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43m[\u001b[49m\u001b[43m]\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m      3\u001b[0m \u001b[38;5;28mprint\u001b[39m(response)\n\u001b[1;32m      4\u001b[0m response, history \u001b[38;5;241m=\u001b[39m model\u001b[38;5;241m.\u001b[39mchat(tokenizer, \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m晚上睡不着应该怎么办\u001b[39m\u001b[38;5;124m\"\u001b[39m, history\u001b[38;5;241m=\u001b[39mhistory)\n",
      "File \u001b[0;32m/workspace/chat-glm3-deployment-fine-tuning/.conda/lib/python3.11/site-packages/torch/utils/_contextlib.py:116\u001b[0m, in \u001b[0;36mcontext_decorator.<locals>.decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    113\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m    114\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m    115\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m ctx_factory():\n\u001b[0;32m--> 116\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/chatglm3-6b/modeling_chatglm.py:1035\u001b[0m, in \u001b[0;36mChatGLMForConditionalGeneration.chat\u001b[0;34m(self, tokenizer, query, history, role, max_length, num_beams, do_sample, top_p, temperature, logits_processor, **kwargs)\u001b[0m\n\u001b[1;32m   1032\u001b[0m inputs \u001b[38;5;241m=\u001b[39m inputs\u001b[38;5;241m.\u001b[39mto(\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[1;32m   1033\u001b[0m eos_token_id \u001b[38;5;241m=\u001b[39m [tokenizer\u001b[38;5;241m.\u001b[39meos_token_id, tokenizer\u001b[38;5;241m.\u001b[39mget_command(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m<|user|>\u001b[39m\u001b[38;5;124m\"\u001b[39m),\n\u001b[1;32m   1034\u001b[0m                 tokenizer\u001b[38;5;241m.\u001b[39mget_command(\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124m<|observation|>\u001b[39m\u001b[38;5;124m\"\u001b[39m)]\n\u001b[0;32m-> 1035\u001b[0m outputs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mgenerate\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43minputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mgen_kwargs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43meos_token_id\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43meos_token_id\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   1036\u001b[0m outputs \u001b[38;5;241m=\u001b[39m outputs\u001b[38;5;241m.\u001b[39mtolist()[\u001b[38;5;241m0\u001b[39m][\u001b[38;5;28mlen\u001b[39m(inputs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124minput_ids\u001b[39m\u001b[38;5;124m\"\u001b[39m][\u001b[38;5;241m0\u001b[39m]):\u001b[38;5;241m-\u001b[39m\u001b[38;5;241m1\u001b[39m]\n\u001b[1;32m   1037\u001b[0m response \u001b[38;5;241m=\u001b[39m tokenizer\u001b[38;5;241m.\u001b[39mdecode(outputs)\n",
      "File \u001b[0;32m/workspace/chat-glm3-deployment-fine-tuning/.conda/lib/python3.11/site-packages/torch/utils/_contextlib.py:116\u001b[0m, in \u001b[0;36mcontext_decorator.<locals>.decorate_context\u001b[0;34m(*args, **kwargs)\u001b[0m\n\u001b[1;32m    113\u001b[0m \u001b[38;5;129m@functools\u001b[39m\u001b[38;5;241m.\u001b[39mwraps(func)\n\u001b[1;32m    114\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mdecorate_context\u001b[39m(\u001b[38;5;241m*\u001b[39margs, \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mkwargs):\n\u001b[1;32m    115\u001b[0m     \u001b[38;5;28;01mwith\u001b[39;00m ctx_factory():\n\u001b[0;32m--> 116\u001b[0m         \u001b[38;5;28;01mreturn\u001b[39;00m \u001b[43mfunc\u001b[49m\u001b[43m(\u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43margs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mkwargs\u001b[49m\u001b[43m)\u001b[49m\n",
      "File \u001b[0;32m/workspace/chat-glm3-deployment-fine-tuning/.conda/lib/python3.11/site-packages/transformers/generation/utils.py:2024\u001b[0m, in \u001b[0;36mGenerationMixin.generate\u001b[0;34m(self, inputs, generation_config, logits_processor, stopping_criteria, prefix_allowed_tokens_fn, synced_gpus, assistant_model, streamer, negative_prompt_ids, negative_prompt_attention_mask, **kwargs)\u001b[0m\n\u001b[1;32m   2016\u001b[0m     input_ids, model_kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_expand_inputs_for_generation(\n\u001b[1;32m   2017\u001b[0m         input_ids\u001b[38;5;241m=\u001b[39minput_ids,\n\u001b[1;32m   2018\u001b[0m         expand_size\u001b[38;5;241m=\u001b[39mgeneration_config\u001b[38;5;241m.\u001b[39mnum_return_sequences,\n\u001b[1;32m   2019\u001b[0m         is_encoder_decoder\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39mconfig\u001b[38;5;241m.\u001b[39mis_encoder_decoder,\n\u001b[1;32m   2020\u001b[0m         \u001b[38;5;241m*\u001b[39m\u001b[38;5;241m*\u001b[39mmodel_kwargs,\n\u001b[1;32m   2021\u001b[0m     )\n\u001b[1;32m   2023\u001b[0m     \u001b[38;5;66;03m# 13. run sample (it degenerates to greedy search when `generation_config.do_sample=False`)\u001b[39;00m\n\u001b[0;32m-> 2024\u001b[0m     result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_sample\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   2025\u001b[0m \u001b[43m        \u001b[49m\u001b[43minput_ids\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2026\u001b[0m \u001b[43m        \u001b[49m\u001b[43mlogits_processor\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprepared_logits_processor\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2027\u001b[0m \u001b[43m        \u001b[49m\u001b[43mlogits_warper\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprepared_logits_warper\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2028\u001b[0m \u001b[43m        \u001b[49m\u001b[43mstopping_criteria\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mprepared_stopping_criteria\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2029\u001b[0m \u001b[43m        \u001b[49m\u001b[43mgeneration_config\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mgeneration_config\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2030\u001b[0m \u001b[43m        \u001b[49m\u001b[43msynced_gpus\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43msynced_gpus\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2031\u001b[0m \u001b[43m        \u001b[49m\u001b[43mstreamer\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstreamer\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2032\u001b[0m \u001b[43m        \u001b[49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[38;5;241;43m*\u001b[39;49m\u001b[43mmodel_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   2033\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   2035\u001b[0m \u001b[38;5;28;01melif\u001b[39;00m generation_mode \u001b[38;5;129;01min\u001b[39;00m (GenerationMode\u001b[38;5;241m.\u001b[39mBEAM_SAMPLE, GenerationMode\u001b[38;5;241m.\u001b[39mBEAM_SEARCH):\n\u001b[1;32m   2036\u001b[0m     \u001b[38;5;66;03m# 11. prepare logits warper\u001b[39;00m\n\u001b[1;32m   2037\u001b[0m     prepared_logits_warper \u001b[38;5;241m=\u001b[39m (\n\u001b[1;32m   2038\u001b[0m         \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_get_logits_warper(generation_config, device\u001b[38;5;241m=\u001b[39minput_ids\u001b[38;5;241m.\u001b[39mdevice)\n\u001b[1;32m   2039\u001b[0m         \u001b[38;5;28;01mif\u001b[39;00m generation_config\u001b[38;5;241m.\u001b[39mdo_sample\n\u001b[1;32m   2040\u001b[0m         \u001b[38;5;28;01melse\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m\n\u001b[1;32m   2041\u001b[0m     )\n",
      "File \u001b[0;32m/workspace/chat-glm3-deployment-fine-tuning/.conda/lib/python3.11/site-packages/transformers/generation/utils.py:3032\u001b[0m, in \u001b[0;36mGenerationMixin._sample\u001b[0;34m(self, input_ids, logits_processor, stopping_criteria, generation_config, synced_gpus, streamer, logits_warper, **model_kwargs)\u001b[0m\n\u001b[1;32m   3030\u001b[0m \u001b[38;5;28;01mif\u001b[39;00m streamer \u001b[38;5;129;01mis\u001b[39;00m \u001b[38;5;129;01mnot\u001b[39;00m \u001b[38;5;28;01mNone\u001b[39;00m:\n\u001b[1;32m   3031\u001b[0m     streamer\u001b[38;5;241m.\u001b[39mput(next_tokens\u001b[38;5;241m.\u001b[39mcpu())\n\u001b[0;32m-> 3032\u001b[0m model_kwargs \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_update_model_kwargs_for_generation\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m   3033\u001b[0m \u001b[43m    \u001b[49m\u001b[43moutputs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3034\u001b[0m \u001b[43m    \u001b[49m\u001b[43mmodel_kwargs\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3035\u001b[0m \u001b[43m    \u001b[49m\u001b[43mis_encoder_decoder\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mconfig\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mis_encoder_decoder\u001b[49m\u001b[43m,\u001b[49m\n\u001b[1;32m   3036\u001b[0m \u001b[43m\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m   3038\u001b[0m unfinished_sequences \u001b[38;5;241m=\u001b[39m unfinished_sequences \u001b[38;5;241m&\u001b[39m \u001b[38;5;241m~\u001b[39mstopping_criteria(input_ids, scores)\n\u001b[1;32m   3039\u001b[0m this_peer_finished \u001b[38;5;241m=\u001b[39m unfinished_sequences\u001b[38;5;241m.\u001b[39mmax() \u001b[38;5;241m==\u001b[39m \u001b[38;5;241m0\u001b[39m\n",
      "File \u001b[0;32m~/.cache/huggingface/modules/transformers_modules/chatglm3-6b/modeling_chatglm.py:871\u001b[0m, in \u001b[0;36mChatGLMForConditionalGeneration._update_model_kwargs_for_generation\u001b[0;34m(self, outputs, model_kwargs, is_encoder_decoder, standardize_cache_format)\u001b[0m\n\u001b[1;32m    863\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21m_update_model_kwargs_for_generation\u001b[39m(\n\u001b[1;32m    864\u001b[0m         \u001b[38;5;28mself\u001b[39m,\n\u001b[1;32m    865\u001b[0m         outputs: ModelOutput,\n\u001b[0;32m   (...)\u001b[0m\n\u001b[1;32m    869\u001b[0m ) \u001b[38;5;241m-\u001b[39m\u001b[38;5;241m>\u001b[39m Dict[\u001b[38;5;28mstr\u001b[39m, Any]:\n\u001b[1;32m    870\u001b[0m     \u001b[38;5;66;03m# update past_key_values\u001b[39;00m\n\u001b[0;32m--> 871\u001b[0m     model_kwargs[\u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mpast_key_values\u001b[39m\u001b[38;5;124m\"\u001b[39m] \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_extract_past_from_model_output\u001b[49m\u001b[43m(\u001b[49m\n\u001b[1;32m    872\u001b[0m \u001b[43m        \u001b[49m\u001b[43moutputs\u001b[49m\u001b[43m,\u001b[49m\u001b[43m \u001b[49m\u001b[43mstandardize_cache_format\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mstandardize_cache_format\u001b[49m\n\u001b[1;32m    873\u001b[0m \u001b[43m    \u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m    875\u001b[0m     \u001b[38;5;66;03m# update attention mask\u001b[39;00m\n\u001b[1;32m    876\u001b[0m     \u001b[38;5;28;01mif\u001b[39;00m \u001b[38;5;124m\"\u001b[39m\u001b[38;5;124mattention_mask\u001b[39m\u001b[38;5;124m\"\u001b[39m \u001b[38;5;129;01min\u001b[39;00m model_kwargs:\n",
      "\u001b[0;31mTypeError\u001b[0m: GenerationMixin._extract_past_from_model_output() got an unexpected keyword argument 'standardize_cache_format'"
     ]
    }
   ],
   "source": [
    "model = model.eval()\n",
    "response, history = model.chat(tokenizer, \"你好\", history=[])\n",
    "print(response)\n",
    "response, history = model.chat(tokenizer, \"晚上睡不着应该怎么办\", history=history)\n",
    "print(response)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.11.9"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
